Lets first set some parameters
m <- 10000
nfoldsI <- 3
r <- 20
folds <- sample(1:nfoldsI, m, replace = TRUE)
adjustment_type <- "BH"
Now we create the betamix sample data according to Section 5.2 with two-dimensional covariates
mus_slope <- 1.5
prob_one_sided <- 0.25
Xs_r <- lapply(1:r, function(i){
Xs <- matrix(runif(m * 2, 0, 1), ncol = 2)
colnames(Xs) <- c("X1", "X2")
Xs
})
Hs_r <- lapply(1:r, function(i){
Xs <- Xs_r[[i]]
pi1s <- ifelse(Xs[, 1]^2 + Xs[, 2]^2 <= 1, 0.02, 0.4)
Hs <- stats::rbinom(m, size = 1, prob = pi1s)
Hs
})
Ps_r <- lapply(1:r, function(i){
Hs <- Hs_r[[i]]
Xs <- Xs_r[[i]]
mus <- pmax(1.3, sqrt(Xs) %*% c(1, 1) * mus_slope)
mu_alphas <- 1 / mus
Ps <- stats::runif(m) * (1 - Hs) + stats::rbeta(m, mu_alphas, 1) * Hs
Ps
})
To run regular IHW, we need to manually bin the covariates in 2d squares
bins1d <- seq(from = 0, to = 1, length.out = 6)
Xs_binned_r <- lapply(1:r, function(i){
Xs <- Xs_r[[i]]
Xs_binned <- data.frame(
X1 = cut(Xs[, 1], bins1d),
X2 = cut(Xs[, 2], bins1d)
)
Xs_binned <- apply(Xs_binned, 1, function(row) {
factor(paste(row[[1]], row[[2]], sep = "*"))
})
Xs_binned
})
We run the normal IHW with the binned covariates
ihw_no_forest_r <- lapply(1:r, function(i) {
Xs_binned <- Xs_binned_r[[i]]
Ps <- Ps_r[[i]]
ihw_no_forest <- ihw(Ps, Xs_binned, alpha = .1, folds = folds, use_forest = FALSE, adjustment_type = "BH", lp_solver = "lpsymphony", lambda = Inf)
})
Now we run the IHW Forest with 100 trees. For the Boca Leek tree we need to provide the censoring parameter tau. The exact value of tau should not be too important. We will revisit this issue in a later stage of the project.
tau <- 0.7
ihw_forest_r <- lapply(1:r, function(i) {
Xs <- Xs_r[[i]]
Ps <- Ps_r[[i]]
ihw_forest <- ihw(Ps, Xs, alpha = .1, folds = folds, use_forest = TRUE, ntrees = 100, tau = tau, lp_solver = "lpsymphony", lambda = Inf)
})
We see that for this example, that the forest structure increases power considerably.
fdp_compare_r <- lapply(1:r, function(i) {
Hs <- Hs_r[[i]]
ihw_no_forest <- ihw_no_forest_r[[i]]
ihw_forest <- ihw_forest_r[[i]]
fdp_eval_no_forest <- fdp_eval(Hs, IHW::rejected_hypotheses(ihw_no_forest))
fdp_eval_forest <- fdp_eval(Hs, IHW::rejected_hypotheses(ihw_forest))
fdp_compare <- rbind(
manually_cut = fdp_eval_no_forest,
use_forest = fdp_eval_forest
)
fdp_compare <- tibble::rownames_to_column(fdp_compare, "forest")
})
fdp_compare_r <- do.call(rbind, fdp_compare_r)
fdp_compare_r <- fdp_compare_r %>%
dplyr::group_by(forest) %>%
dplyr::summarise(
rjs = mean(rjs),
pow = mean(pow),
FDR = mean(FDP),
FWER = mean(FWER)
)
fdp_compare_r
## # A tibble: 2 x 5
## forest rjs pow FDR FWER
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 manually_cut 104. 0.0939 0.0773 1
## 2 use_forest 152. 0.136 0.0850 1
For completeness, let’s plot the weights in 2d as in Nikos’ draft of proof pdf
ihw_no_forest <- ihw_no_forest_r[[1]]
Xs <- Xs_r[[1]]
data <- data.frame(Xs, weight = ihw_no_forest@df[["weight"]])
ggplot(data = data, aes(x = X1, y = X2, color = weight)) +
geom_point()
ihw_forest <- ihw_forest_r[[1]]
data <- data.frame(Xs, weight = ihw_forest@df[["weight"]])
ggplot(data = data, aes(x = X1, y = X2, color = weight)) +
geom_point()